/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#include "pagespeed/kernel/html/html_keywords.h"

#include <algorithm>
#include <cstddef>
#include <map>
#include <utility>

#include "base/logging.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/writer.h"
#include "pagespeed/kernel/html/html_name.h"

namespace net_instaweb {

namespace {

struct HtmlKeywordsSequence {
  const char* sequence;
  const unsigned char value[3];
};

// TODO(jmarantz): the multi-byte sequences are not working yet.
HtmlKeywordsSequence kHtmlKeywordsSequences[] = {
    {"AElig", {0xC6, 0x0}},  {"Aacute", {0xC1, 0x0}}, {"Acirc", {0xC2, 0x0}},
    {"Agrave", {0xC0, 0x0}}, {"Aring", {0xC5, 0x0}},  {"Atilde", {0xC3, 0x0}},
    {"Auml", {0xC4, 0x0}},   {"Ccedil", {0xC7, 0x0}}, {"ETH", {0xD0, 0x0}},
    {"Eacute", {0xC9, 0x0}}, {"Ecirc", {0xCA, 0x0}},  {"Egrave", {0xC8, 0x0}},
    {"Euml", {0xCB, 0x0}},   {"Iacute", {0xCD, 0x0}}, {"Icirc", {0xCE, 0x0}},
    {"Igrave", {0xCC, 0x0}}, {"Iuml", {0xCF, 0x0}},   {"Ntilde", {0xD1, 0x0}},
    {"Oacute", {0xD3, 0x0}}, {"Ocirc", {0xD4, 0x0}},  {"Ograve", {0xD2, 0x0}},
    {"Oslash", {0xD8, 0x0}}, {"Otilde", {0xD5, 0x0}}, {"Ouml", {0xD6, 0x0}},
    {"THORN", {0xDE, 0x0}},  {"Uacute", {0xDA, 0x0}}, {"Ucirc", {0xDB, 0x0}},
    {"Ugrave", {0xD9, 0x0}}, {"Uuml", {0xDC, 0x0}},   {"Yacute", {0xDD, 0x0}},
    {"aacute", {0xE1, 0x0}}, {"acirc", {0xE2, 0x0}},  {"acute", {0xB4, 0x0}},
    {"aelig", {0xE6, 0x0}},  {"agrave", {0xE0, 0x0}}, {"amp", {0x26, 0x0}},
    {"aring", {0xE5, 0x0}},  {"atilde", {0xE3, 0x0}}, {"auml", {0xE4, 0x0}},
    {"brvbar", {0xA6, 0x0}}, {"ccedil", {0xE7, 0x0}}, {"cedil", {0xB8, 0x0}},
    {"cent", {0xA2, 0x0}},   {"copy", {0xA9, 0x0}},   {"curren", {0xA4, 0x0}},
    {"deg", {0xB0, 0x0}},    {"divide", {0xF7, 0x0}}, {"eacute", {0xE9, 0x0}},
    {"ecirc", {0xEA, 0x0}},  {"egrave", {0xE8, 0x0}}, {"eth", {0xF0, 0x0}},
    {"euml", {0xEB, 0x0}},   {"frac12", {0xBD, 0x0}}, {"frac14", {0xBC, 0x0}},
    {"frac34", {0xBE, 0x0}}, {"gt", {0x3E, 0x0}},     {"iacute", {0xED, 0x0}},
    {"icirc", {0xEE, 0x0}},  {"iexcl", {0xA1, 0x0}},  {"igrave", {0xEC, 0x0}},
    {"iquest", {0xBF, 0x0}}, {"iuml", {0xEF, 0x0}},   {"laquo", {0xAB, 0x0}},
    {"lt", {0x3C, 0x0}},     {"macr", {0xAF, 0x0}},   {"micro", {0xB5, 0x0}},
    {"middot", {0xB7, 0x0}}, {"nbsp", {0xA0, 0x0}},   {"not", {0xAC, 0x0}},
    {"ntilde", {0xF1, 0x0}}, {"oacute", {0xF3, 0x0}}, {"ocirc", {0xF4, 0x0}},
    {"ograve", {0xF2, 0x0}}, {"ordf", {0xAA, 0x0}},   {"ordm", {0xBA, 0x0}},
    {"oslash", {0xF8, 0x0}}, {"otilde", {0xF5, 0x0}}, {"ouml", {0xF6, 0x0}},
    {"para", {0xB6, 0x0}},   {"plusmn", {0xB1, 0x0}}, {"pound", {0xA3, 0x0}},
    {"quot", {0x22, 0x0}},   {"raquo", {0xBB, 0x0}},  {"reg", {0xAE, 0x0}},
    {"sect", {0xA7, 0x0}},   {"shy", {0xAD, 0x0}},    {"sup1", {0xB9, 0x0}},
    {"sup2", {0xB2, 0x0}},   {"sup3", {0xB3, 0x0}},   {"szlig", {0xDF, 0x0}},
    {"thorn", {0xFE, 0x0}},  {"times", {0xD7, 0x0}},  {"uacute", {0xFA, 0x0}},
    {"ucirc", {0xFB, 0x0}},  {"ugrave", {0xF9, 0x0}}, {"uml", {0xA8, 0x0}},
    {"uuml", {0xFC, 0x0}},   {"yacute", {0xFD, 0x0}}, {"yen", {0xA5, 0x0}},
    {"yuml", {0xFF, 0x0}}};

// http://www.w3.org/TR/html4/sgml/entities.html contains a list of multi-byte
// codes.  When we see any of these in an HTML attribute, we cannot currently
// unescape it, because we have no general strategy for multi-byte encoding.
//
// TODO(jmarantz): 1. We could adopt UTF-8 as our in-memory representation for
// manipulating attribute values, independent of the source document encoding.
//
// TODO(jmarantz): 2. We could sense UTF-8 encoding on documents based
// on Content-Type &/or meta-tags & thereby allow parsing of HTML
// attributes with utf-8 encodings.  This is not strictly necessary
// for URL attributes where this is not allowed.  See
// http://www.w3.org/TR/html401/appendix/notes.html#h-B.2.
//
//
// See also: http://www.w3.org/TR/2011/WD-html5-20110113/
// named-character-references.html#named-character-references
// which appears to contradict other references in the validity of &apos; and
// likely other details.  This may be issue of HTML5 vs HTML4.

struct HtmlMultiByteSequence {
  const char* sequence;
  int value;  // ignored for now.
};

struct HtmlMultiByteSequenceCompare {
  // TODO(jmarantz): Write a version of StringCaseCompare based on const char*
  // to avoid extra calls to strlen.
  bool operator()(const HtmlMultiByteSequence& a, const char* b) const {
    return StringCaseCompare(a.sequence, b) < 0;
  }
  bool operator()(const char* a, const HtmlMultiByteSequence& b) const {
    return StringCaseCompare(a, b.sequence) < 0;
  }

  // Seems to be needed by MSVC in debug build.
  bool operator()(const HtmlMultiByteSequence& a,
                  const HtmlMultiByteSequence& b) const {
    return StringCaseCompare(a.sequence, b.sequence) < 0;
  }
};

HtmlMultiByteSequence kHtmlKeywordsSequencesMultiByte[] = {
    {"alefsym", 8501}, {"Alpha", 913},   {"alpha", 945},   {"and", 8743},
    {"ang", 8736},     {"asymp", 8776},  {"bdquo", 8222},  {"Beta", 914},
    {"beta", 946},     {"bull", 8226},   {"cap", 8745},    {"Chi", 935},
    {"chi", 967},      {"circ", 710},    {"clubs", 9827},  {"cong", 8773},
    {"crarr", 8629},   {"cup", 8746},    {"dagger", 8224}, {"Dagger", 8225},
    {"darr", 8595},    {"dArr", 8659},   {"Delta", 916},   {"delta", 948},
    {"diams", 9830},   {"empty", 8709},  {"emsp", 8195},   {"ensp", 8194},
    {"Epsilon", 917},  {"epsilon", 949}, {"equiv", 8801},  {"Eta", 919},
    {"eta", 951},      {"euro", 8364},   {"exist", 8707},  {"fnof", 402},
    {"forall", 8704},  {"frasl", 8260},  {"Gamma", 915},   {"gamma", 947},
    {"ge", 8805},      {"harr", 8596},   {"hArr", 8660},   {"hearts", 9829},
    {"hellip", 8230},  {"image", 8465},  {"infin", 8734},  {"int", 8747},
    {"Iota", 921},     {"iota", 953},    {"isin", 8712},   {"Kappa", 922},
    {"kappa", 954},    {"Lambda", 923},  {"lambda", 955},  {"lang", 9001},
    {"larr", 8592},    {"lArr", 8656},   {"lceil", 8968},  {"ldquo", 8220},
    {"le", 8804},      {"lfloor", 8970}, {"lowast", 8727}, {"loz", 9674},
    {"lrm", 8206},     {"lsaquo", 8249}, {"lsquo", 8216},  {"mdash", 8212},
    {"minus", 8722},   {"Mu", 924},      {"mu", 956},      {"nabla", 8711},
    {"ndash", 8211},   {"ne", 8800},     {"ni", 8715},     {"notin", 8713},
    {"nsub", 8836},    {"Nu", 925},      {"nu", 957},      {"OElig", 338},
    {"oelig", 339},    {"oline", 8254},  {"Omega", 937},   {"omega", 969},
    {"Omicron", 927},  {"omicron", 959}, {"oplus", 8853},  {"or", 8744},
    {"otimes", 8855},  {"part", 8706},   {"permil", 8240}, {"perp", 8869},
    {"Phi", 934},      {"phi", 966},     {"Pi", 928},      {"pi", 960},
    {"piv", 982},      {"prime", 8242},  {"Prime", 8243},  {"prod", 8719},
    {"prop", 8733},    {"Psi", 936},     {"psi", 968},     {"radic", 8730},
    {"rang", 9002},    {"rarr", 8594},   {"rArr", 8658},   {"rceil", 8969},
    {"rdquo", 8221},   {"real", 8476},   {"rfloor", 8971}, {"Rho", 929},
    {"rho", 961},      {"rlm", 8207},    {"rsaquo", 8250}, {"rsquo", 8217},
    {"sbquo", 8218},   {"Scaron", 352},  {"scaron", 353},  {"sdot", 8901},
    {"Sigma", 931},    {"sigma", 963},   {"sigmaf", 962},  {"sim", 8764},
    {"spades", 9824},  {"sub", 8834},    {"sube", 8838},   {"sum", 8721},
    {"sup", 8835},     {"supe", 8839},   {"Tau", 932},     {"tau", 964},
    {"there4", 8756},  {"Theta", 920},   {"theta", 952},   {"thetasym", 977},
    {"thinsp", 8201},  {"tilde", 732},   {"trade", 8482},  {"uarr", 8593},
    {"uArr", 8657},    {"upsih", 978},   {"Upsilon", 933}, {"upsilon", 965},
    {"weierp", 8472},  {"Xi", 926},      {"xi", 958},      {"Yuml", 376},
    {"Zeta", 918},     {"zeta", 950},    {"zwj", 8205},    {"zwnj", 8204},
};

// String constants used to populate maps at initialization time.
// These are a little more expressive than static arrays of keywords.
// The penalty for this expressiveness is lack of compile-time checking,
// and startup time.  But the compile-time checking is replaced by
// debug-only init checks.

// Tables are a 4-level hierarchy:
//   table > [thead tbody tfoot] > tr > [td th]
//
// Note: we use trailing spaces in all these strings so that they can
// be concatenated more easily.  Note that we use 'omit_empty_strings'
// when we interpret via SplitStringPieceToVector.
const char kTableLeaves[] = "td th ";
const char kTableSections[] = "tbody tfoot thead ";
const char kTableElements[] = "td th tbody tfoot thead table tr ";
// TODO(jmarantz): consider caption, col, colgroup.

// Formatting elements are terminated by many other tags.
const char kFormattingElements[] =
    "b i em font strong small s cite q dfn abbr time code var "
    "samp kbd sub u mark bdi bdo ";
// TODO(jmarantz): consider ins and del & potentially lots more.

const char kListElements[] = "li ol ul ";
const char kDeclarationElements[] = "dl dt dd ";

const char kParagraphTerminators[] =
    "address article aside blockquote dir div dl fieldset "
    "footer form h1 h2 h3 h4 h5 h6 header hgroup hr menu nav ol p "
    "pre section table ul";

// TODO(jmarantz): handle & test Ruby containment.
// const char kRubyElements[] = "ruby rt rp ";

}  // namespace

HtmlKeywords* HtmlKeywords::singleton_ = nullptr;

HtmlKeywords::HtmlKeywords() {
  InitEscapeSequences();
  InitAutoClose();
  InitContains();
  InitOptionallyClosedKeywords();
}

void HtmlKeywords::InitEscapeSequences() {
  unescape_insensitive_map_.set_deleted_key("");
  unescape_sensitive_map_.set_deleted_key("");
  escape_map_.set_deleted_key("");

  StringSetInsensitive case_sensitive_symbols;
  for (size_t i = 0; i < arraysize(kHtmlKeywordsSequences); ++i) {
    // Put all symbols in the case-sensitive map
    const HtmlKeywordsSequence& seq = kHtmlKeywordsSequences[i];
    unescape_sensitive_map_[seq.sequence] =
        reinterpret_cast<const char*>(seq.value);

    // Don't populate the case-insensitive map for symbols that we've
    // already determined are case-sensitive.
    if (case_sensitive_symbols.find(seq.sequence) ==
        case_sensitive_symbols.end()) {
      // If this symbol is already present in the insensitive map, then it
      // must be case-sensitive.  E.g. &AElig; and &aelig; are distinct.
      StringStringSparseHashMapInsensitive::iterator p =
          unescape_insensitive_map_.find(seq.sequence);
      if (p != unescape_insensitive_map_.end()) {
        // As this symbol is case-sensitive, we must remove it from the
        // case-insensitive map.  This way we will report an error for
        // &Aelig;, rather than &AElig; or &aelig; unpredictably.
        unescape_insensitive_map_.erase(p);
        case_sensitive_symbols.insert(seq.sequence);
      } else {
        unescape_insensitive_map_[seq.sequence] =
            reinterpret_cast<const char*>(seq.value);
      }

      // For now, we will only generate symbolic escaped-names for
      // single-byte sequences
      if (strlen(reinterpret_cast<const char*>(seq.value)) == 1) {
        escape_map_[reinterpret_cast<const char*>(seq.value)] = seq.sequence;
      }
    }
  }

  // Initialialize the keywords from HtmlName into a reverse table.  This could
  // have been generated by gperf, but it isn't.  It's easy enough to build it
  // given an iterator.
  keyword_vector_.resize(HtmlName::num_keywords());
  for (HtmlName::Iterator iter; !iter.AtEnd(); iter.Next()) {
    DCHECK_GE(HtmlName::num_keywords(), iter.keyword());
    keyword_vector_[iter.keyword()] = StringPiece(iter.name());
  }

  // Make sure the multi-byte sequences are in order (case-insensitive).  Ties
  // are OK to account for case differences.
  for (size_t i = 1; i < arraysize(kHtmlKeywordsSequencesMultiByte); ++i) {
    DCHECK(StringCaseCompare(kHtmlKeywordsSequencesMultiByte[i - 1].sequence,
                             kHtmlKeywordsSequencesMultiByte[i].sequence) <= 0);
  }
}

void HtmlKeywords::Init() {
  if (singleton_ == nullptr) {
    singleton_ = new HtmlKeywords();
  }
}

void HtmlKeywords::ShutDown() {
  if (singleton_ != nullptr) {
    delete singleton_;
    singleton_ = nullptr;
  }
}

StringPiece HtmlKeywords::UnescapeHelper(const StringPiece& escaped,
                                         GoogleString* buf,
                                         bool* decoding_error) const {
  if (escaped.empty()) {
    *decoding_error = false;
    return escaped;
  }
  *decoding_error = true;

  // It would be nice to short-circuit the loop below via a memchr
  // looking for "&".  Unfortunately we must at least also scan for
  // 8-bit characters, as we cannot unescape those in a manner that's
  // bidirectionally safe.  Consider CanonicalAttributesTest.Spanish,
  // where a non-utf8 multi-byte 8-bit character is present.  If we
  // short-circuit looking for "&" we'll wind up escaping each piece
  // of the multi-byte sequence individually and that will not reverse
  // properly.
  // if (memchr(escaped.data(), '&', escaped.size()) == NULL) {
  //   return escaped;
  // }

  buf->clear();

  // Attribute values may have HTML escapes in them, e.g.
  //    href="host.com/path?v1&amp;v2"
  // Un-escape the attribute value here before populating the
  // attribute data structure.
  GoogleString escape;
  uint32 numeric_value = 0;
  bool accumulate_numeric_code = false;
  bool hex_mode = false;
  bool in_escape = false;
  bool found_ampersand = false;
  for (size_t i = 0; i < escaped.size(); ++i) {
    uint8 ch = static_cast<uint8>(escaped[i]);
    if (!in_escape) {
      if (ch == '&') {
        if (!found_ampersand) {
          found_ampersand = true;
          buf->append(escaped.data(), i);
        }
        in_escape = true;
        escape.clear();
        numeric_value = 0;
        accumulate_numeric_code = false;
        hex_mode = false;
      } else if (ch > 127) {
        return StringPiece(nullptr, 0);
      } else if (found_ampersand) {
        *buf += ch;
      }
    } else if (escape.empty() && (ch == '#')) {
      escape += ch;
      accumulate_numeric_code = true;
      if (((i + 1) < escaped.size()) && (UpperChar(escaped[i + 1]) == 'X')) {
        hex_mode = true;
        ++i;
      }
    } else if (ch == ';') {
      if (!TryUnescape(accumulate_numeric_code, numeric_value, escape, true,
                       buf)) {
        return StringPiece(nullptr, 0);
      }
      in_escape = false;
    } else if (ch > 127) {
      return StringPiece(nullptr, 0);
    } else {
      // Accumulate an escape code based on the current mode (symbolic, decimal,
      // or hex).  If the character is not plausible to accumulate into the
      // escape sequence, then consider it improperly terminated and try to
      // look it up.
      bool improperly_terminated = false;
      if (accumulate_numeric_code) {
        improperly_terminated =
            ((hex_mode && !AccumulateHexValue(ch, &numeric_value)) ||
             (!hex_mode && !AccumulateDecimalValue(ch, &numeric_value)));
      } else if (!IsAsciiAlphaNumeric(ch)) {
        improperly_terminated = true;
      }
      if (improperly_terminated) {
        if (TryUnescape(accumulate_numeric_code, numeric_value, escape, false,
                        buf)) {
          DCHECK_LT(static_cast<size_t>(0), i);
          --i;  // re-examine ch now that we've unescaped what came before it.
          in_escape = false;
        } else {
          return StringPiece(nullptr, 0);
        }
      } else {
        escape += ch;
      }
    }
  }
  if (!found_ampersand) {
    *decoding_error = false;
    return escaped;
  }
  if (in_escape) {
    if (escape.empty()) {
      buf->push_back('&');
    } else if (!TryUnescape(accumulate_numeric_code, numeric_value, escape,
                            false, buf)) {
      return StringPiece(nullptr, 0);
    }
  }
  *decoding_error = false;
  return StringPiece(*buf);
}

bool HtmlKeywords::TryUnescape(bool accumulate_numeric_code,
                               uint32 numeric_value, const GoogleString& escape,
                               bool was_terminated, GoogleString* buf) const {
  if (accumulate_numeric_code && (escape.size() > 1)) {
    // See the table in the Description in
    // http://en.wikipedia.org/wiki/UTF-8#Codepage_layout .
    if (numeric_value <= 0xff) {
      *buf += static_cast<char>(numeric_value);
    } else {
      return false;
    }
  } else {
    // Some symbols are case-sensitive (AElig vs aelig are different
    // code-points) whereas some are case-insensitive (&quot; and
    // &QUOT; both work.  So do the case-sensitive lookup first, and
    // if that fails, do an insensitive lookup.
    StringStringSparseHashMapSensitive::const_iterator p =
        unescape_sensitive_map_.find(escape);
    if (p != unescape_sensitive_map_.end()) {
      *buf += p->second;
    } else {
      // The sensitive lookup failed, but allow, for example, &QUOT; to work
      // in place of &quot;.  However, note that "yuml" is single
      // byte and "Yuml" is multi-byte, and we first look in our multi-byte
      // set before allowing doing the case-insensitive lookup in our
      // single-byte encodings list.
      if (std::binary_search(kHtmlKeywordsSequencesMultiByte,
                             (kHtmlKeywordsSequencesMultiByte +
                              arraysize(kHtmlKeywordsSequencesMultiByte)),
                             escape.c_str(), HtmlMultiByteSequenceCompare())) {
        // Reject known multi-byte
        // sequences e.g. (&hellip;), as we would wind up adding an extra &amp;
        // e.g. (&amp;&hellip;).
        return false;
      } else {
        // Handle the case the user has used the wrong case for a
        // valid escape sequence, e.g. QUOT;, but there is no
        // multi-byte match (e.g. Yuml;).  We can allow sloppy
        // interpretation with a case insensitive lookup here.
        StringStringSparseHashMapInsensitive::const_iterator q =
            unescape_insensitive_map_.find(escape);
        if (q != unescape_insensitive_map_.end()) {
          *buf += q->second;
        } else {
          // &apos; is a special case.  It is *not* legal HTML but many
          // web designers think it is.  It does not work on IE.  So
          // recognize that & fix it.
          // http://fishbowl.pastiche.org/2003/07/01/the_curse_of_apos/
          //
          // This is special-cased here because we want to unescape &apos;
          // into a single-quote, but do not want to re-escape ' into &apos;,
          // but instead into &#39;.  If we put that in the keyword map
          // above then we'd get the reverse mapping we don't want.
          //
          // TODO(jmarantz): consider allowing ' to &apos; for XHTML.
          if (StringCaseEqual(escape, "apos")) {
            *buf += '\'';
          } else {
            // Let random words through (e.g. accept a&b literal, which we would
            // re-escape & "improve" as a&amp;b).
            *buf += "&";
            *buf += escape;
            if (was_terminated) {
              *buf += ";";
            }
          }
        }
      }
    }
  }
  return true;
}

StringPiece HtmlKeywords::EscapeHelper(const StringPiece& unescaped,
                                       GoogleString* buf) const {
  if (unescaped.data() == nullptr) {
    return unescaped;
  }
  buf->clear();

  GoogleString char_to_escape;
  for (size_t i = 0; i < unescaped.size(); ++i) {
    int ch = static_cast<unsigned char>(unescaped[i]);
    // According to http://www.htmlescape.net/htmlescape_tool.html,
    // single-quote does not need to be escaped.  However, input HTML
    // might have used single-quote to quote attribute values, in
    // which case we better escape any single-quotes in the value.
    //
    // This function, unfortunately, does not know what quoting was used.
    // TODO(jmarantz): in remove_quotes filter, switch between ' and " for
    // quoting based on whatever is in the attr value.
    if (!IsHtmlSpace(ch) &&
        ((ch > 127) || (ch < 32) || (ch == '"') || (ch == '\'') ||
         (ch == '&') || (ch == '<') || (ch == '>'))) {
      char_to_escape.clear();
      char_to_escape += ch;
      StringStringSparseHashMapSensitive::const_iterator p =
          escape_map_.find(char_to_escape);
      if (p == escape_map_.end()) {
        absl::StrAppendFormat(buf, "&#%02d;", static_cast<int>(ch));
      } else {
        *buf += '&';
        *buf += p->second;
        *buf += ';';
      }
    } else {
      *buf += unescaped[i];
    }
  }
  return StringPiece(*buf);
}

void HtmlKeywords::AddCrossProduct(const StringPiece& k1_list,
                                   const StringPiece& k2_list,
                                   KeywordPairVec* kmap) {
  StringPieceVector v1, v2;
  SplitStringPieceToVector(k1_list, " ", &v1, true);
  SplitStringPieceToVector(k2_list, " ", &v2, true);
  for (int i = 0, n1 = v1.size(); i < n1; ++i) {
    HtmlName::Keyword k1 = HtmlName::Lookup(v1[i]);
    DCHECK_NE(HtmlName::kNotAKeyword, k1) << v1[i];
    for (int j = 0, n2 = v2.size(); j < n2; ++j) {
      HtmlName::Keyword k2 = HtmlName::Lookup(v2[j]);
      DCHECK_NE(HtmlName::kNotAKeyword, k2) << v2[j];
      KeywordPair k1_k2 = MakeKeywordPair(k1, k2);
      kmap->push_back(k1_k2);
    }
  }
}

void HtmlKeywords::AddToSet(const StringPiece& klist, KeywordVec* kset) {
  StringPieceVector v;
  SplitStringPieceToVector(klist, " ", &v, true);
  for (int i = 0, n = v.size(); i < n; ++i) {
    HtmlName::Keyword k = HtmlName::Lookup(v[i]);
    DCHECK_NE(HtmlName::kNotAKeyword, k) << v[i];
    kset->push_back(k);
  }
}

namespace {

// Sorts the passed-in vector to enable binary_search.  The vector is
// sorted by T::operator<.  If in the future a binary search requires
// a custom comparator then this function should also be changed to
// take that comparator.
//
// vec must not be empty.
template <class T>
void PrepareForBinarySearch(std::vector<T>* vec) {
  CHECK(!vec->empty());
  std::sort(vec->begin(), vec->end());
  // Make sure there are no duplicates
#ifndef NDEBUG
  typename std::vector<T>::iterator p = std::unique(vec->begin(), vec->end());
  if (p != vec->end()) {
    T duplicate_value = *p;
    LOG(DFATAL) << "Duplicate set element " << duplicate_value;
  }
#endif
}

}  // namespace

void HtmlKeywords::InitAutoClose() {
  // The premise of our lookup machinery is that HtmlName::Keyword can
  // be represented in a 16-bit int, so that we can make a pair using
  // SHIFT+OR.
  DCHECK_EQ(HtmlName::num_keywords(), HtmlName::num_keywords() & 0xffff);

  // TODO(jmarantz): these deserve another pass through the HTML5 spec.
  // Note that http://www.w3.org/TR/html5/syntax.html#optional-tags
  // covers many of these cases, but omits the general situation that
  // formatting elements should be automatically closed when they
  // hit most other tags.
  //
  // However, there is discussion of relevance in and around:
  // http://www.w3.org/TR/html5/the-end.html#misnested-tags:-b-i-b-i

  AddAutoClose(kTableLeaves, kTableLeaves);
  AddAutoClose(kTableLeaves, "tr");
  AddAutoClose("tr", kTableSections);
  AddAutoClose("tr", "tr");
  AddAutoClose(kTableSections, kTableSections);

  AddAutoClose("p", kParagraphTerminators);

  AddAutoClose("li", "li");
  AddAutoClose("dd dt", "dd dt");
  AddAutoClose("rp rt", "rp rt");
  AddAutoClose("optgroup", "optgroup");
  AddAutoClose("option", "optgroup option");
  AddAutoClose(kFormattingElements,
               StrCat("tr ", kListElements, kDeclarationElements));
  PrepareForBinarySearch(&auto_close_);
}

void HtmlKeywords::InitContains() {
  // TODO(jmarantz): these deserve another pass through the HTML5 spec.  Note
  // that the HTML5 spec doesn't have a 'containment' section but there is
  // discussion of the context in which tags can reside in the doc for each
  // tag, and discussion of relevance in and around:
  // http://www.w3.org/TR/html5/the-end.html#misnested-tags:-b-i-b-i
  //
  // Also see http://www.whatwg.org/specs/web-apps/current-work
  // /multipage/syntax.html#optional-tags which describes auto-closing
  // elements whose parents have no more content.

  AddContained(kTableLeaves, "table");
  AddContained("tr", "table");
  AddContained(kTableSections, "table");
  AddContained("li", "ul ol");
  AddContained("dd dt", "dl");
  AddContained("rt rp", "ruby");
  AddContained(kFormattingElements, "td th");
  PrepareForBinarySearch(&contained_);
}

// These tags do not need to be explicitly closed, but can be.  See
// http://www.w3.org/TR/html5/syntax.html#optional-tags .  Note that
// this is *not* consistent with
// http://www.w3schools.com/tags/tag_p.asp which claims that the <p>
// tag works the same in XHTML as HTML.  This is clearly wrong since
// real XHTML has XML syntax which requires explicit closing tags.
//
// Note that we will close any of these tags without warning.
void HtmlKeywords::InitOptionallyClosedKeywords() {
  AddToSet(kFormattingElements, &optionally_closed_);
  AddToSet("body colgroup dd dt html optgroup option p", &optionally_closed_);
  AddToSet(kListElements, &optionally_closed_);
  AddToSet(kTableElements, &optionally_closed_);
  PrepareForBinarySearch(&optionally_closed_);
}

bool HtmlKeywords::WritePre(StringPiece str, StringPiece style, Writer* writer,
                            MessageHandler* handler) {
  GoogleString tag, escaped;
  if (style.empty()) {
    tag = "<pre>";
  } else {
    tag = StrCat("<pre style=\"", style, "\">");
  }
  bool ret = writer->Write(tag, handler);
  ret &= writer->Write(HtmlKeywords::Escape(str, &escaped), handler);
  ret &= writer->Write("</pre>\n", handler);
  return ret;
}

}  // namespace net_instaweb
